#Part I
# In the hackathon a project was proposed to collect data from student video
# watching; a sample of this data is available in the file video-data.csv.
# The variables are:
#   stid             = student id
#   year             = year the student watched the video
#   participation    = whether or not the student opened the video
#   watch.time       = how long the student watched the video for
#   confusion.points = how many times a student rewatched a section of a video
#   key.points       = how many times a student skipped or increased the speed of a video
#Load the package(s) you just installed
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the sample of video-watching data into D1
D1 <- read.csv(file = "video-data.csv", header = TRUE)
# Keep only the rows recorded in 2018
D2 <- D1 %>% filter(year == 2018)
# Histogram of 2018 watch time with the default binning
hist(D2$watch.time)
# Same data with roughly 100 bins -- does the impression change?
hist(D2$watch.time, breaks = 100)
# Same 100-bin histogram with the y-axis limited to 0-10
hist(D2$watch.time, ylim = c(0, 10), breaks = 100)
# Default y-axis again, with unequal bins 0-5, 5-20, 20-25 and 25-35
hist(D2$watch.time, breaks = c(0, 5, 20, 25, 35))
# Scatter plot: confusion points on the x-axis, watch time on the y-axis
plot(D1$confusion.points, D1$watch.time)
# Two small numeric vectors of equal length
x <- c(1, 3, 2, 7, 6, 4, 4)
y <- c(2, 4, 2, 3, 2, 4, 3)
# Cross-tabulate x (rows) against y (columns)
table1 <- table(x, y)
# Draw the contingency table as a bar plot
barplot(height = table1)
# Average key points per year: one row per year, mean stored in mean_key
D3 <- summarise(group_by(D1, year), mean_key = mean(key.points))
## `summarise()` ungrouping output (override with `.groups` argument)
# Dashed line of the yearly average against year
plot(D3$year, D3$mean_key, lty = "dashed", type = "l")
# Box plot of watch time for three students (stid 4, 20 and 22)
D4 <- D1 %>% filter(stid %in% c(4, 20, 22))
# droplevels() discards any factor levels that no longer occur in the subset
D4 <- droplevels(D4)
boxplot(watch.time ~ stid, data = D4, xlab = "Student", ylab = "Watch Time")
## Pairs
# Matrix-style indexing: every row, columns 2, 5, 6 and 7
D5 <- D1[, c(2, 5:7)]
# Scatter-plot matrix covering every pairing of the selected columns
pairs(D5)
## Part II
library(dplyr)
# Draw 100 scores from a normal distribution (mean 75, sd 15)
score <- rnorm(100, mean = 75, sd = 15)
hist(score, breaks = 30)
# Keep only the scores that do not exceed 100
S1 <- data.frame(score)
S1 <- S1 %>% filter(score <= 100)
hist(S1$score)
# One row scored exactly 100 for every draw that was removed above
S2 <- data.frame(score = rep(100, 100 - nrow(S1)))
# Recombine so there are 100 rows again, then round to whole numbers
S3 <- bind_rows(S1, S2)
S3$score <- round(S3$score)
# Give each row a randomly drawn interest and a sequential student id
interest <- c("sport", "music", "nature", "literature")
S3$interest <- sample(interest, size = 100, replace = TRUE)
S3$stid <- seq(from = 1, to = 100, by = 1)
hist(S3$score, breaks = 10)
# Cut the scores into 10 equal-width bins labelled a-j
label <- letters[1:10]
S3$breaks <- cut(S3$score, breaks = 10, labels = label)
library(RColorBrewer)
# Show every palette RColorBrewer provides. In the display the top section is
# sequential, the middle section qualitative and the lower section diverging.
display.brewer.all()
# One "BrBG" colour per score bin (the 10 bins stored in S3$breaks)
score.col <- brewer.pal(10, "BrBG")
# Look the colour up through each row's bin so that a student's colour
# reflects the score bin. Assigning the bare 10-colour palette to the column
# would only recycle it down the 100 rows in row order.
S3$colors <- score.col[as.integer(S3$breaks)]
# Histogram bars take their colours from the palette itself, one per bar
hist(S3$score, breaks = 10, col = score.col)
# Four "Set3" colours, one per interest group
interest.col <- brewer.pal(4, "Set3")
boxplot(score ~ interest, S3, col = interest.col)
# Simulated login counts between 1 and 25, one per student
S3$login <- sample(1:25, 100, replace = TRUE)
# Each point is coloured by the student's score bin
plot(S3$login, S3$score, col = S3$colors, main = "Student Logins vs. Scores")
# Built-in AirPassengers time series: plot it and keep a data-frame copy
plot(AirPassengers)
AP <- data.frame(AirPassengers)
# Built-in iris data as a data frame, drawn with plot() and then with pairs()
Iris <- data.frame(iris)
plot(Iris)
pairs(Iris)
# Sepal.Length vs Petal.Length; Sepal.Length vs Petal.Width; Sepal.Width vs Petal.Width; Petal.Length vs Petal.Width
# In this repository you will find data describing Swirl activity from the
# class so far this semester. Please connect RStudio to this repository.
# Create a data frame from the swirl-data.csv file called DF1
DF1 <- read.csv("swirl-data.csv", header = TRUE)
# The variables are:
#   course_name     - the name of the R course the student attempted
#   lesson_name     - the lesson name
#   question_number - the question number attempted
#   correct         - whether the question was answered correctly
#   attempt         - how many times the student attempted the question
#   skipped         - whether the student skipped the question
#   datetime        - the date and time the student attempted the question
#   hash            - anonymized student ID
# Create a new data frame called DF2 that only includes the variables hash,
# lesson_name and attempt
DF2 <- select(DF1, hash, lesson_name, attempt)
# Use the group_by function to create a data frame called DF3 that sums all
# the attempts for each hash by each lesson_name
DF3 <- DF2 %>%
  group_by(hash, lesson_name) %>%
  summarize(TotalAttempts = sum(attempt))
## `summarise()` regrouping output by 'hash' (override with `.groups` argument)
# On a scrap piece of paper draw what you think DF3 would look like if all
# the lesson names were column names, then convert DF3 to this format:
# one row per hash, one column per lesson_name, cells holding TotalAttempts.
# The wide copy is stored separately because the plot below uses the long DF3.
DF3_wide <- DF3 %>%
  ungroup() %>%
  pivot_wider(names_from = lesson_name, values_from = TotalAttempts)
# Plot the long-format DF3: one point per hash/lesson_name pair, coloured by
# the total number of attempts
library(ggplot2)
ggplot(DF3, aes(x = lesson_name, y = hash, color = TotalAttempts)) +
  geom_point() +
  expand_limits(y = 0)
# Create a new data frame from DF1 called DF4 that only includes the
# variables hash, lesson_name and correct
DF4 <- select(DF1, hash, lesson_name, correct)
# Recode the correct variable so that TRUE is the number 1 and FALSE is 0
DF4$correct <- ifelse(DF4$correct == "TRUE", 1, 0)
# Create DF5, a mean score for each student on each course. The score is the
# mean of the 1/0 recode of correct (not of attempt). DF4 carries no
# course_name column, so the recode is applied to DF1's correct column inside
# the summary. Missing values of correct are skipped.
DF5 <- DF1 %>%
  group_by(hash, course_name) %>%
  summarise(meanscore = mean(ifelse(correct == "TRUE", 1, 0), na.rm = TRUE))
## `summarise()` regrouping output by 'hash' (override with `.groups` argument)
# Convert the datetime variable into month-day-year format and create a new
# data frame (DF6) that shows the average correct for each day
DF <- select(DF1, correct, datetime)
# Recode correct as 1/0 so that its mean is the share answered correctly
DF$correct <- ifelse(DF$correct == TRUE, 1, 0)
# NOTE(review): datetime is presumably stored as seconds since the Unix
# epoch -- confirm against the data
DF$datetime <- as.POSIXct(DF$datetime, origin = "1970-01-01 00:00.00 UTC")
# Reduce each timestamp to a "mm-dd-yy" day label
DF$datetime <- strftime(DF$datetime, format = "%m-%d-%y")
# Average of correct per day, skipping missing values
DF6 <- summarise(group_by(DF, datetime), average = mean(correct, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)